# Imports: NLTK for text/frequency analysis, pandas for tabular work,
# plotly express for figures, and a project-local directory corpus reader.
import nltk
from nltk.text import Text, TextCollection
import pandas as pd
from utils.data.readCorpus import NltkCorpusFromDir
import plotly.express as px
import plotly.io as pio
# Render Plotly figures inline in the notebook (MIME bundle + notebook renderer).
pio.renderers.default = "plotly_mimetype+notebook"
# Prepare the corpus: read every .txt file under the corpus root and wrap each
# document as an nltk Text; TextCollection provides corpus-wide statistics.
latinise = NltkCorpusFromDir(root="/home/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas", fileids=r".*\.txt")
# Comprehension instead of the original append loop (same documents, same order).
latinise_docs = [Text(latinise.words(fileid)) for fileid in latinise.fileids()]
latinise_collection = TextCollection(latinise_docs)
# The lemmas under study.
terms = ['gens', 'natio', 'civitas', 'populus', 'urbs']
# Pin one Plotly qualitative colour per term so every figure below uses the
# same palette (zip pairs term i with Plotly colour i, as the original did).
color_discrete_map_terms = dict(zip(terms, px.colors.qualitative.Plotly))
# Corpus metadata: join file names with the external metadata table and derive
# a 200-year "period" label plus a per-file token count.
filenames = latinise.fileids()
# Assumes file names look like <a>_<b>_<id>_... so the third underscore-separated
# field is the metadata id — TODO confirm against the corpus naming scheme.
filenames = pd.DataFrame([(fname, fname.split('_')[2]) for fname in filenames], columns=["filename","id"])
metadata = pd.read_csv("/media/HOME_FOLDERS/krzys/Kod/lvlt22/BMG/latinise_metadata.csv", index_col="id")
metadata = metadata.merge(filenames, on="id")
metadata = metadata.drop_duplicates('id')
metadata = metadata.set_index('filename')
# 200-year bins spanning -450..950, labelled "start-end".
bins = range(-450, 951, 200)
labels = ['-'.join([str(lo), str(hi)]) for lo, hi in zip(bins[:-1], bins[1:])]
metadata["period"] = pd.cut(metadata["date"], bins, labels=labels, include_lowest=True)
# BUG FIX: the original ended `... else 0 in filename`, which parses as
# `else (0 in filename)` — an int-in-str membership test that raises TypeError
# when the else branch is taken. The intent is a token count of 0 for files
# missing from the corpus.
metadata["no_tokens"] = [len(latinise.words(filename)) if filename in latinise.fileids() else 0
                         for filename in metadata.index.tolist()]
# Year-by-year frequencies: one ConditionalFreqDist condition per attested date,
# counting every token from all texts dated to that year.
dates = metadata["date"].unique()
cfd_year = nltk.ConditionalFreqDist()
for year in dates:
    same_year_files = metadata[metadata["date"] == year].index
    for token in latinise.words(same_year_files):
        cfd_year[year][token] += 1
# Total token counts per year, plus the 200-year period each year falls into.
freq_by_year = pd.DataFrame({"year": pd.to_numeric(cfd_year.conditions())})
# FreqDist.N() is the total sample count, i.e. sum of the distribution's values.
freq_by_year["count"] = [cfd_year[year].N() for year in freq_by_year["year"]]
freq_by_year["period"] = pd.cut(freq_by_year["year"], bins=bins, labels=labels, include_lowest=True)
# Per-term frequency by year: one row per (year, term) with the raw count.
rows = [(year, term, dist[term])
        for year, dist in cfd_year.items()
        for term in terms]
terms_by_year = pd.DataFrame(rows, columns=["year", "term", "count"])
terms_by_year["year"] = pd.to_numeric(terms_by_year["year"])
terms_by_year["count"] = pd.to_numeric(terms_by_year["count"])
terms_by_year["period"] = pd.cut(terms_by_year["year"], bins=bins, labels=labels, include_lowest=True)
terms_by_year.head()
| | year | term | count | period |
|---|---|---|---|---|
| 0 | -9 | gens | 0 | -50-150 |
| 1 | -9 | natio | 0 | -50-150 |
| 2 | -9 | civitas | 0 | -50-150 |
| 3 | -9 | populus | 10 | -50-150 |
| 4 | -9 | urbs | 7 | -50-150 |
# Scatter of raw term counts per year. CONSISTENCY FIX: pass the shared
# per-term colour map, as every other figure in this file does, so term
# colours match across plots.
fig = px.scatter(terms_by_year, x="year", y="count", color="term",
                 color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by year")
fig.show()
# Per-period relative frequency: parts per million of all tokens in the period.
import numpy as np

# String alias "sum" instead of np.sum: passing numpy callables to .agg is
# deprecated in recent pandas; the alias uses the same fast path.
freq_by_period = freq_by_year.groupby(["period"], as_index=False).agg({'count': "sum"})  # all tokens by period
terms_by_period = terms_by_year.groupby(["term","period"], as_index=False).agg({'count': "sum"})
# BUG FIX: the original merged with how="right", whose output is ordered by the
# right frame's key (period) while terms_by_period is ordered by term — the
# subsequent index-aligned division then paired each row with the WRONG period
# total. how="left" preserves terms_by_period's row order, so count_y is each
# row's own period total.
counts_by_period = pd.merge(terms_by_period, freq_by_period, on=["period"], how="left")["count_y"]
terms_by_period["ppm"] = ( terms_by_period["count"] / counts_by_period ) * 1000000
terms_by_period.head()
| | term | period | count | ppm |
|---|---|---|---|---|
| 0 | civitas | -450--250 | 0 | 0.000000e+00 |
| 1 | civitas | -250--50 | 395 | 5.316285e+05 |
| 2 | civitas | -50-150 | 1598 | 2.150740e+06 |
| 3 | civitas | 150-350 | 509 | 6.850606e+05 |
| 4 | civitas | 350-550 | 977 | 1.314939e+06 |
# Line chart: relative frequency (ppm) of each term across the periods.
fig = px.line(
    terms_by_period,
    x="period",
    y="ppm",
    color="term",
    color_discrete_map=color_discrete_map_terms,
)
fig.update_layout(title="Frequency of the terms by period (ppm)")
fig.show()
# Stacked bar chart: raw term counts per period.
fig = px.bar(
    terms_by_period,
    x="period",
    y="count",
    color="term",
    color_discrete_map=color_discrete_map_terms,
)
fig.update_layout(title="Frequency of the terms by period", barmode='stack')
fig.show()
# Frequency by century: bucket years into 100-year bins labelled -5..9
# (same edges as the original hand-written list: -500, -400, ..., 1000).
century = list(range(-500, 1001, 100))
terms_by_year["century"] = pd.cut(terms_by_year["year"], bins=century, labels=range(-5, 10, 1))
terms_by_year.head()
| | year | term | count | period | century |
|---|---|---|---|---|---|
| 0 | -9 | gens | 0 | -50-150 | -1 |
| 1 | -9 | natio | 0 | -50-150 | -1 |
| 2 | -9 | civitas | 0 | -50-150 | -1 |
| 3 | -9 | populus | 10 | -50-150 | -1 |
| 4 | -9 | urbs | 7 | -50-150 | -1 |
# Aggregate term counts per century and plot.
# String alias "sum" instead of np.sum: passing numpy callables to .agg is
# deprecated in recent pandas; the alias is the supported spelling.
terms_by_century = terms_by_year.groupby(["century", "term"]).agg({'count': "sum"}).reset_index()
fig = px.line(terms_by_century, x="century", y="count", color="term",
              color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by century")
fig.show()
# Per-file frequency distributions: one ConditionalFreqDist condition per fileid.
import itertools

cfd_bytext = nltk.ConditionalFreqDist()
# IDIOM FIX: the original iterated `[x for x in latinise.fileids()]` — a
# pointless copy of the list — and bound the loop variable to `file`.
for fileid in latinise.fileids():
    for word in latinise.words(fileid):
        cfd_bytext[fileid][word] += 1
# Term counts per text, joined with the metadata table (merge matches the
# "filename" column against metadata's index level of the same name) and
# normalised to parts per million of each text's token count.
records = [(filename, dist[term])
           for filename, dist in cfd_bytext.items()
           for term in terms]
terms_by_text = pd.DataFrame(
    [(filename, term, dist[term])
     for filename, dist in cfd_bytext.items()
     for term in terms],
    columns=["filename", "term", "count"],
)
terms_by_text = pd.merge(terms_by_text, metadata, on="filename")
terms_by_text['ppm'] = terms_by_text['count'].div(terms_by_text['no_tokens']).mul(1000000)
terms_by_text.head()
# Top 5 texts per (period, term) by raw count, plotted as faceted bars.
top_terms_by_text = (
    terms_by_text
    .sort_values('count', ascending=False)
    .groupby(["period", "term"])
    .head(5)
    .sort_values("count", ascending=False)
)
fig = px.bar(
    top_terms_by_text,
    x="id",
    y="count",
    color="term",
    facet_col="period",
    facet_row="term",
    text="title",
    facet_col_wrap=2,
    category_orders={"period": labels},
    height=800,
    hover_data=["id", "creator", "title", "period"],
    color_discrete_map=color_discrete_map_terms,
)
fig.update_layout(title="Texts with max number of terms by period",
                  uniformtext_minsize=10, uniformtext_mode='hide')
fig.update_xaxes(matches=None, visible=False)
fig.update_yaxes(matches=None)
fig.show()
# Top 5 texts per (period, term) by relative frequency (ppm).
top_terms_by_text = (
    terms_by_text
    .sort_values('ppm', ascending=False)
    .groupby(["period", "term"])
    .head(5)
    .sort_values("ppm", ascending=False)
)
fig = px.bar(
    top_terms_by_text,
    x="id",
    y="ppm",
    color="term",
    facet_col="period",
    facet_row="term",
    text="title",
    facet_col_wrap=2,
    category_orders={"period": labels},
    height=1200,
    hover_data=["id", "creator", "title", "period"],
    color_discrete_map=color_discrete_map_terms,
)
fig.update_layout(title="Texts with max number of terms by period (ppm)",
                  uniformtext_minsize=14)
fig.update_xaxes(matches=None, visible=False)
fig.update_traces(textposition='inside', textfont_size=16)
fig.show()
# Authors with the highest term counts per period.
# String alias "sum" instead of np.sum in .agg: numpy callables there are
# deprecated in recent pandas; the alias is the supported spelling.
top_terms_by_author = (
    terms_by_text
    .sort_values('count', ascending=False)
    .groupby(["creator", "term", "period"], observed=True)
    .agg({'count': "sum"})
    .reset_index()
    .sort_values("count", ascending=False)
)
fig = px.bar(
    top_terms_by_author,
    x="creator",
    y="count",
    color="term",
    facet_col="period",
    facet_row="term",
    text="creator",
    facet_col_wrap=2,
    category_orders={"period": labels},
    height=800,
    hover_data=["creator", "period"],
    color_discrete_map=color_discrete_map_terms,
)
fig.update_layout(title="Authors with max number of terms by period", uniformtext_minsize=8)
fig.update_xaxes(matches=None, visible=False)
fig.update_traces(textposition='outside', textfont_size=14)
fig.update_yaxes(matches=None)
fig.show()
# Terms by genre: top 5 texts per (term, type, period) by raw count.
top_terms_by_text.head()
top_terms_by_genre = (
    terms_by_text
    .sort_values('count', ascending=False)
    .groupby(["term", "type", "period"])
    .head(5)
    .sort_values("count", ascending=False)
)
top_terms_by_genre.head()
fig = px.bar(
    top_terms_by_genre,
    x="period",
    y="count",
    color="term",
    facet_col="period",
    facet_row="type",
    facet_col_wrap=2,
    category_orders={"period": labels},
    height=800,
    hover_data=["id", "creator", "title", "period"],
    color_discrete_map=color_discrete_map_terms,
)
fig.update_layout(title="Texts with max number of terms by period (raw)", uniformtext_minsize=8)
fig.update_xaxes(matches=None, visible=False)
fig.update_yaxes(matches=None)
fig.show()